{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第四章 逻辑回归模型 - 股票客户流失预警模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1.案例实战 - 股票客户流失预警模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>账户资金（元）</th>\n",
       "      <th>最后一次交易距今时间（天）</th>\n",
       "      <th>上月交易佣金（元）</th>\n",
       "      <th>累计交易佣金（元）</th>\n",
       "      <th>本券商使用时长（年）</th>\n",
       "      <th>是否流失</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22686.5</td>\n",
       "      <td>297</td>\n",
       "      <td>149.25</td>\n",
       "      <td>2029.85</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>190055.0</td>\n",
       "      <td>42</td>\n",
       "      <td>284.75</td>\n",
       "      <td>3889.50</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>29733.5</td>\n",
       "      <td>233</td>\n",
       "      <td>269.25</td>\n",
       "      <td>2108.15</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>185667.5</td>\n",
       "      <td>44</td>\n",
       "      <td>211.50</td>\n",
       "      <td>3840.75</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33648.5</td>\n",
       "      <td>213</td>\n",
       "      <td>353.50</td>\n",
       "      <td>2151.65</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    账户资金（元）  最后一次交易距今时间（天）  上月交易佣金（元）  累计交易佣金（元）  本券商使用时长（年）  是否流失\n",
       "0   22686.5            297     149.25    2029.85           0     0\n",
       "1  190055.0             42     284.75    3889.50           2     0\n",
       "2   29733.5            233     269.25    2108.15           0     1\n",
       "3  185667.5             44     211.50    3840.75           3     0\n",
       "4   33648.5            213     353.50    2151.65           0     1"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1.读取数据\n",
    "import pandas as pd\n",
    "df = pd.read_excel('股票客户流失.xlsx')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.划分特征变量和目标变量\n",
    "X = df.drop(columns='是否流失') \n",
    "y = df['是否流失']   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>账户资金（元）</th>\n",
       "      <th>最后一次交易距今时间（天）</th>\n",
       "      <th>上月交易佣金（元）</th>\n",
       "      <th>累计交易佣金（元）</th>\n",
       "      <th>本券商使用时长（年）</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1814</th>\n",
       "      <td>43251.5</td>\n",
       "      <td>192</td>\n",
       "      <td>98.50</td>\n",
       "      <td>2258.35</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5946</th>\n",
       "      <td>304449.5</td>\n",
       "      <td>22</td>\n",
       "      <td>369.50</td>\n",
       "      <td>5160.55</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3881</th>\n",
       "      <td>441357.5</td>\n",
       "      <td>9</td>\n",
       "      <td>325.75</td>\n",
       "      <td>6681.75</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2389</th>\n",
       "      <td>587076.5</td>\n",
       "      <td>2</td>\n",
       "      <td>427.25</td>\n",
       "      <td>8300.85</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3676</th>\n",
       "      <td>204027.5</td>\n",
       "      <td>39</td>\n",
       "      <td>352.00</td>\n",
       "      <td>4044.75</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       账户资金（元）  最后一次交易距今时间（天）  上月交易佣金（元）  累计交易佣金（元）  本券商使用时长（年）\n",
       "1814   43251.5            192      98.50    2258.35           0\n",
       "5946  304449.5             22     369.50    5160.55           3\n",
       "3881  441357.5              9     325.75    6681.75           5\n",
       "2389  587076.5              2     427.25    8300.85           5\n",
       "3676  204027.5             39     352.00    4044.75           2"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3.划分训练集和测试集\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)  # 设置random_state使得每次划分的数据一样\n",
    "\n",
    "X_train.head()  # 显示训练集X_train的前5行，在别的代码编辑器里需要通过print()函数打印查看\n",
    "# y_train.head()  # 显示训练集y_train的前5行，在别的代码编辑器里需要通过print()函数打印查看\n",
    "# X_test.head()  # 显示测试集X_test的前5行，在别的代码编辑器里需要通过print()函数打印查看\n",
    "# y_test.head()  # 显示测试集y_test的前5行，在别的代码编辑器里需要通过print()函数打印查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4.模型搭建\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]\n"
     ]
    }
   ],
   "source": [
    "# 5.模型使用1 - 预测数据结果\n",
    "y_pred = model.predict(X_test)\n",
    "print(y_pred[0:100])  # 打印预测内容的前100个看看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>预测值</th>\n",
       "      <th>实际值</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   预测值  实际值\n",
       "0    0    0\n",
       "1    0    0\n",
       "2    0    0\n",
       "3    0    1\n",
       "4    0    0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 放到一个DataFrame里进行查看比对\n",
    "a = pd.DataFrame()  # 创建一个空DataFrame \n",
    "a['预测值'] = list(y_pred)\n",
    "a['实际值'] = list(y_test)\n",
    "a.head()  # 可以看到此时前5个预测准确度为80%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7977288857345636\n"
     ]
    }
   ],
   "source": [
    "# 查看全部的预测准确度\n",
    "from sklearn.metrics import accuracy_score\n",
    "score = accuracy_score(y_pred, y_test)\n",
    "print(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7977288857345636"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 另外一种查看模型预测准确度的方法\n",
    "model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.82041491, 0.17958509],\n",
       "       [0.84029613, 0.15970387],\n",
       "       [0.79819342, 0.20180658],\n",
       "       [0.62989192, 0.37010808],\n",
       "       [0.61636611, 0.38363389]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 6.模型使用2 - 预测概率\n",
    "y_pred_proba = model.predict_proba(X_test)  \n",
    "y_pred_proba[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>不流失概率</th>\n",
       "      <th>流失概率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.820415</td>\n",
       "      <td>0.179585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.840296</td>\n",
       "      <td>0.159704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.798193</td>\n",
       "      <td>0.201807</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.629892</td>\n",
       "      <td>0.370108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.616366</td>\n",
       "      <td>0.383634</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      不流失概率      流失概率\n",
       "0  0.820415  0.179585\n",
       "1  0.840296  0.159704\n",
       "2  0.798193  0.201807\n",
       "3  0.629892  0.370108\n",
       "4  0.616366  0.383634"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 另一种查看概率的方式\n",
    "a = pd.DataFrame(y_pred_proba, columns=['不流失概率', '流失概率'])\n",
    "a.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.17958509, 0.15970387, 0.20180658, ..., 0.04220544, 0.09782449,\n",
       "       0.63586739])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 只查看流失的概率（也即y=1概率，即上面二维数组的第二列）\n",
    "y_pred_proba[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 2.41952469e-05,  8.16881490e-03,  1.04320950e-02,\n",
       "        -2.54894468e-03, -1.10120609e-04]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 7.查看各个特征变量的系数（额外知识点，供参考）\n",
    "model.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.43393291e-06])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.17958509]\n",
      "[0.15970387]\n",
      "[0.20180658]\n",
      "[0.37010808]\n",
      "[0.38363389]\n"
     ]
    }
   ],
   "source": [
    "# 通过公式获取流失概率\n",
    "import numpy as np\n",
    "for i in range(5):  # 这里演示前5条测试集数据的预测流失的概率\n",
    "    print(1 / (1 + np.exp(-(np.dot(X_test.iloc[i], model.coef_.T) + model.intercept_))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]\n",
      "0.7977288857345636\n",
      "[[0.82041491 0.17958509]\n",
      " [0.84029613 0.15970387]\n",
      " [0.79819342 0.20180658]\n",
      " [0.62989192 0.37010808]\n",
      " [0.61636611 0.38363389]]\n"
     ]
    }
   ],
   "source": [
    "# 代码汇总\n",
    "# 1.读取数据\n",
    "import pandas as pd\n",
    "df = pd.read_excel('股票客户流失.xlsx')\n",
    "\n",
    "# 2.划分特征变量和目标变量\n",
    "X = df.drop(columns='是否流失') \n",
    "y = df['是否流失']\n",
    "\n",
    "# 3.划分训练集和测试集\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)\n",
    "\n",
    "# 4.模型搭建\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# 5.模型使用1 - 预测数据结果\n",
    "y_pred = model.predict(X_test)\n",
    "print(y_pred[0:100])  # 打印预测内容的前100个看看\n",
    "\n",
    "# 查看全部的预测准确度\n",
    "from sklearn.metrics import accuracy_score\n",
    "score = accuracy_score(y_pred, y_test)\n",
    "print(score)  # 打印整体的预测准确度\n",
    "\n",
    "# 6.模型使用2 - 预测概率\n",
    "y_pred_proba = model.predict_proba(X_test)  \n",
    "print(y_pred_proba[0:5])  # 打印前5个客户的分类概率"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
